library(tidyverse)
library(data.table)
library(mice)
library(skimr)
library(corrplot)
library(cowplot)
# Load the bank marketing data (bank-additional-full.csv, semicolon-delimited).
# stringsAsFactors is set explicitly: since R 4.0.0 read.csv() returns text
# columns as character by default, but the factor-based boxplots, the coloured
# pairs plot and the factor summaries in this analysis need the categorical
# columns (including the response) to be factors.
bankraw <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_2/master/Data/bank-additional-full.csv",
  header = TRUE,
  sep = ";",
  strip.white = TRUE,
  stringsAsFactors = TRUE
)
str(bankraw)
## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Preview the first six rows of the raw data.
head(bankraw, n = 6L)
##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         93.994         -36.4     4.857        5191 no
## 2         93.994         -36.4     4.857        5191 no
## 3         93.994         -36.4     4.857        5191 no
## 4         93.994         -36.4     4.857        5191 no
## 5         93.994         -36.4     4.857        5191 no
## 6         93.994         -36.4     4.857        5191 no

Question of Interest:

EDA to narrow down the variables to use for the logistic regression model

Properly naming response variable

# Give the response column a descriptive name: "y" becomes "Subscription".
bankraw <- bankraw %>% rename(Subscription = y)

Removing logically irrelevant variables

  • Upon reviewing the variables and their accompanying descriptions we have decided to remove ‘duration’. This is due to the fact that the time duration of a call would be a post performance metric that would not be something known in advance (before the subscription would or would not take place).
# Show the column types again, now with the renamed response column.
str(object = bankraw)
## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ Subscription  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Drop the logically irrelevant variable "duration": call length is only known
# after the call, so it cannot be used to predict the outcome in advance.
bankraw2 <- bankraw %>% select(-duration)
head(bankraw2, n = 6L)
##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week campaign pdays previous    poutcome emp.var.rate
## 1         mon        1   999        0 nonexistent          1.1
## 2         mon        1   999        0 nonexistent          1.1
## 3         mon        1   999        0 nonexistent          1.1
## 4         mon        1   999        0 nonexistent          1.1
## 5         mon        1   999        0 nonexistent          1.1
## 6         mon        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed Subscription
## 1         93.994         -36.4     4.857        5191           no
## 2         93.994         -36.4     4.857        5191           no
## 3         93.994         -36.4     4.857        5191           no
## 4         93.994         -36.4     4.857        5191           no
## 5         93.994         -36.4     4.857        5191           no
## 6         93.994         -36.4     4.857        5191           no
# Open the data viewer only in an interactive session, so knitting the report
# or running the script in batch mode never tries to launch a viewer.
if (interactive()) {
  view(bankraw2)
}

NA Evaluation and Drop

# Missing-value check: md.pattern() reports the missingness pattern per column.
bankraw2 %>% md.pattern()
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##       age job marital education default housing loan contact month
## 41188   1   1       1         1       1       1    1       1     1
##         0   0       0         0       0       0    0       0     0
##       day_of_week campaign pdays previous poutcome emp.var.rate
## 41188           1        1     1        1        1            1
##                 0        0     0        0        0            0
##       cons.price.idx cons.conf.idx euribor3m nr.employed Subscription  
## 41188              1             1         1           1            1 0
##                    0             0         0           0            0 0
#Results show no NAs

Zero variance variable check - all show variance so remain in model

# Per-column summary (missing counts, unique levels, spread) used to confirm
# that no column is constant.
skim(bankraw2)
## Skim summary statistics
##  n obs: 41188 
##  n variables: 20 
## 
## ── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────────────────────
##      variable missing complete     n n_unique
##       contact       0    41188 41188        2
##   day_of_week       0    41188 41188        5
##       default       0    41188 41188        3
##     education       0    41188 41188        8
##       housing       0    41188 41188        3
##           job       0    41188 41188       12
##          loan       0    41188 41188        3
##       marital       0    41188 41188        4
##         month       0    41188 41188       10
##      poutcome       0    41188 41188        3
##  Subscription       0    41188 41188        2
##                                   top_counts ordered
##                cel: 26144, tel: 15044, NA: 0   FALSE
##   thu: 8623, mon: 8514, wed: 8134, tue: 8090   FALSE
##          no: 32588, unk: 8597, yes: 3, NA: 0   FALSE
##  uni: 12168, hig: 9515, bas: 6045, pro: 5243   FALSE
##       yes: 21576, no: 18622, unk: 990, NA: 0   FALSE
##  adm: 10422, blu: 9254, tec: 6743, ser: 3969   FALSE
##        no: 33950, yes: 6248, unk: 990, NA: 0   FALSE
##   mar: 24928, sin: 11568, div: 4612, unk: 80   FALSE
##  may: 13769, jul: 7174, aug: 6178, jun: 5318   FALSE
##      non: 35563, fai: 4252, suc: 1373, NA: 0   FALSE
##                  no: 36548, yes: 4640, NA: 0   FALSE
## 
## ── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete     n   mean     sd p0 p25 p50 p75 p100
##       age       0    41188 41188  40.02  10.42 17  32  38  47   98
##  campaign       0    41188 41188   2.57   2.77  1   1   2   3   56
##     pdays       0    41188 41188 962.48 186.91  0 999 999 999  999
##  previous       0    41188 41188   0.17   0.49  0   0   0   0    7
##      hist
##  ▂▇▆▃▁▁▁▁
##  ▇▁▁▁▁▁▁▁
##  ▁▁▁▁▁▁▁▇
##  ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ────────────────────────────────────────────────────────────────────────────────────────────────
##        variable missing complete     n     mean    sd      p0     p25
##   cons.conf.idx       0    41188 41188  -40.5    4.63  -50.8   -42.7 
##  cons.price.idx       0    41188 41188   93.58   0.58   92.2    93.08
##    emp.var.rate       0    41188 41188    0.082  1.57   -3.4    -1.8 
##       euribor3m       0    41188 41188    3.62   1.73    0.63    1.34
##     nr.employed       0    41188 41188 5167.04  72.25 4963.6  5099.1 
##      p50     p75    p100     hist
##   -41.8   -36.4   -26.9  ▁▅▆▃▇▁▁▁
##    93.75   93.99   94.77 ▁▁▅▅▁▇▁▂
##     1.1     1.4     1.4  ▁▁▃▁▁▁▁▇
##     4.86    4.96    5.04 ▂▃▁▁▁▁▁▇
##  5191    5228.1  5228.1  ▁▁▁▁▃▁▃▇

Continuous Variable Review

  • First we will review all continuous variables and see which of those are the most associated with our categorical response variable.
    • To accomplish this we will create boxplots of the individual continuous variables compared to the categorical response variable and look for large differences in the summary statistic ranges for the categorical variables.
# Draw side-by-side boxplots of one numeric column, split by Subscription.
#   df    : data frame with a factor column `Subscription` and the column `var`
#   var   : name (string) of the numeric column to plot
#   label : text used for the y-axis label and in the plot title
# plot(<factor>, <numeric>) produces a boxplot. The title has to be passed as
# `main`; `title` is not a graphical parameter, so the titles were never drawn.
subscriptionBoxplot <- function(df, var, label = var) {
  plot(
    df$Subscription, df[[var]],
    xlab = "Subscription",
    ylab = label,
    main = paste("Subscription v", label),
    col = c(82, 107)
  )
}

subscriptionBoxplot(bankraw2, "age", "Age")

# In the initial campaign boxplot the outliers prevent us from seeing the
# actual shape of the boxes.
subscriptionBoxplot(bankraw2, "campaign", "Campaign")

# Drop all campaign values > 6 in an attempt to see the graph more clearly.
campOutliers <- bankraw2[!(bankraw2$campaign > 6), ]
subscriptionBoxplot(campOutliers, "campaign", "Campaign <= 6")

# Upon further review of this variable we are going to create a new variable
# that is 'Contacted' or 'Not Contacted' and evaluate it as a categorical
# variable against 'Subscription'. This will be done in objective 1; for now,
# in the EDA, it stays numeric.
subscriptionBoxplot(bankraw2, "pdays")

# To evaluate pdays as numeric, drop the rows with the value 999 and re-plot.
# NOTE(review): 999 looks like a placeholder rather than a real day count --
# confirm against the data dictionary.
pdaysOutliers <- bankraw2[!(bankraw2$pdays == 999), ]
subscriptionBoxplot(pdaysOutliers, "pdays")

# Remaining numeric predictors, one boxplot each.
subscriptionBoxplot(bankraw2, "previous")
subscriptionBoxplot(bankraw2, "cons.conf.idx")
subscriptionBoxplot(bankraw2, "cons.price.idx")
subscriptionBoxplot(bankraw2, "emp.var.rate")
subscriptionBoxplot(bankraw2, "euribor3m")
subscriptionBoxplot(bankraw2, "nr.employed")

  • An additional way to view which continuous variables are likely to influence the dependent variable is through density plots. The function below draws a density plot for every continuous variable from above; anywhere the peaks for the two response levels are clearly separated, the variable is likely to influence the dependent variable.
# Overlaid density curves of one numeric column, one curve per response level.
#   df          : data frame containing both columns
#   explanatory : name (string) of the column mapped to the x-axis
#   response    : name (string) of the grouping column mapped to the fill colour
# Returns the ggplot object; the caller is responsible for printing it.
# aes_string() is deprecated, so the columns are looked up through the .data
# pronoun instead. labs() pins the axis and legend titles to the plain column
# names so the plots read the same as before.
densityPlots <- function(df, explanatory, response) {
  ggplot(df, aes(x = .data[[explanatory]], fill = .data[[response]])) +
    geom_density(alpha = 0.5) +
    labs(x = explanatory, fill = response)
}
# Build one density plot per numeric column of bankraw2, each split by
# Subscription, then print them one after another.
densityPlotsList <- lapply(
  names(Filter(is.numeric, bankraw2)),
  function(numericCol) densityPlots(bankraw2, numericCol, "Subscription")
)
for (densityPlot in densityPlotsList) {
  print(densityPlot)
}

#Testing to see if function above works properly
#densityPlots(bankraw2, "age", "Subscription")

Continuous Variable Removal

  • From the above boxplots and density plots we can see that there are some viable variables to leverage in predicting our Subscription variable, listed below:
    • “pdays”, “campaign”, “previous”, “cons.price.idx”, “cons.conf.idx”, “euribor3m”, “nr.employed”
  • This means the removal of the below metrics:
    • “age”
# Remove "age" from the working data and re-run the summary to confirm the
# remaining columns.
bankraw2 <- bankraw2 %>% select(-age)
skim(bankraw2)
## Skim summary statistics
##  n obs: 41188 
##  n variables: 19 
## 
## ── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────────────────────
##      variable missing complete     n n_unique
##       contact       0    41188 41188        2
##   day_of_week       0    41188 41188        5
##       default       0    41188 41188        3
##     education       0    41188 41188        8
##       housing       0    41188 41188        3
##           job       0    41188 41188       12
##          loan       0    41188 41188        3
##       marital       0    41188 41188        4
##         month       0    41188 41188       10
##      poutcome       0    41188 41188        3
##  Subscription       0    41188 41188        2
##                                   top_counts ordered
##                cel: 26144, tel: 15044, NA: 0   FALSE
##   thu: 8623, mon: 8514, wed: 8134, tue: 8090   FALSE
##          no: 32588, unk: 8597, yes: 3, NA: 0   FALSE
##  uni: 12168, hig: 9515, bas: 6045, pro: 5243   FALSE
##       yes: 21576, no: 18622, unk: 990, NA: 0   FALSE
##  adm: 10422, blu: 9254, tec: 6743, ser: 3969   FALSE
##        no: 33950, yes: 6248, unk: 990, NA: 0   FALSE
##   mar: 24928, sin: 11568, div: 4612, unk: 80   FALSE
##  may: 13769, jul: 7174, aug: 6178, jun: 5318   FALSE
##      non: 35563, fai: 4252, suc: 1373, NA: 0   FALSE
##                  no: 36548, yes: 4640, NA: 0   FALSE
## 
## ── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete     n   mean     sd p0 p25 p50 p75 p100
##  campaign       0    41188 41188   2.57   2.77  1   1   2   3   56
##     pdays       0    41188 41188 962.48 186.91  0 999 999 999  999
##  previous       0    41188 41188   0.17   0.49  0   0   0   0    7
##      hist
##  ▇▁▁▁▁▁▁▁
##  ▁▁▁▁▁▁▁▇
##  ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ────────────────────────────────────────────────────────────────────────────────────────────────
##        variable missing complete     n     mean    sd      p0     p25
##   cons.conf.idx       0    41188 41188  -40.5    4.63  -50.8   -42.7 
##  cons.price.idx       0    41188 41188   93.58   0.58   92.2    93.08
##    emp.var.rate       0    41188 41188    0.082  1.57   -3.4    -1.8 
##       euribor3m       0    41188 41188    3.62   1.73    0.63    1.34
##     nr.employed       0    41188 41188 5167.04  72.25 4963.6  5099.1 
##      p50     p75    p100     hist
##   -41.8   -36.4   -26.9  ▁▅▆▃▇▁▁▁
##    93.75   93.99   94.77 ▁▁▅▅▁▇▁▂
##     1.1     1.4     1.4  ▁▁▃▁▁▁▁▇
##     4.86    4.96    5.04 ▂▃▁▁▁▁▁▇
##  5191    5228.1  5228.1  ▁▁▁▁▃▁▃▇

Continuous Variable Multicollinearity Check

  • Multicollinearity will weaken the model
  • At first glance there does seem to be some correlation between a few of the continuous variables
  • When highlighting the yes versus no result for signing up, we cannot see a clear separation of any kind. This will lead us away from utilizing the principal component analysis technique for variable selection

  • The below pairs graphs shows us the separation of Yes and No results of the dependent variable by graph. We can observe that there is no clear separation and therefore PCA would not be a good variable reduction tool moving forward. So we continue with a standard EDA.

# Open the data viewer only in an interactive session.
if (interactive()) {
  view(bankraw2)
}
# Reduce to the numeric columns and draw a scatterplot matrix coloured by the
# response, to see whether the yes/no results separate well enough to justify
# using PCA. The stray empty argument of the original `pairs(, col = ...)` call
# is dropped; the piped data is still the first argument.
bankraw2 %>%
  keep(is.numeric) %>%
  pairs(col = bankraw2$Subscription)

  • To continue checking for multicollinearity we additionally run a correlation matrix
    • Using the correlation matrix we can much more clearly see highly correlated variables in blue and orange gradient
# Correlation matrix of the numeric columns, drawn as the upper triangle with
# the coefficients printed in each cell. `type` is named explicitly: the bare
# "upper" only reached `type` because `method` was supplied by name.
bankraw2 %>%
  keep(is.numeric) %>%
  cor() %>%
  corrplot(
    type = "upper",
    addCoef.col = "white",
    number.digits = 2,
    number.cex = 0.5,
    method = "square",
    order = "hclust",
    tl.srt = 45,
    tl.cex = 0.8
  )

# Remove the mutually correlated economic indicators ("euribor3m",
# "nr.employed", "emp.var.rate") and "pdays", which is correlated with
# "previous".
bank3 <- select(bankraw2, -c("pdays", "euribor3m", "nr.employed", "emp.var.rate"))
skim(bank3)
## Skim summary statistics
##  n obs: 41188 
##  n variables: 15 
## 
## ── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────────────────────
##      variable missing complete     n n_unique
##       contact       0    41188 41188        2
##   day_of_week       0    41188 41188        5
##       default       0    41188 41188        3
##     education       0    41188 41188        8
##       housing       0    41188 41188        3
##           job       0    41188 41188       12
##          loan       0    41188 41188        3
##       marital       0    41188 41188        4
##         month       0    41188 41188       10
##      poutcome       0    41188 41188        3
##  Subscription       0    41188 41188        2
##                                   top_counts ordered
##                cel: 26144, tel: 15044, NA: 0   FALSE
##   thu: 8623, mon: 8514, wed: 8134, tue: 8090   FALSE
##          no: 32588, unk: 8597, yes: 3, NA: 0   FALSE
##  uni: 12168, hig: 9515, bas: 6045, pro: 5243   FALSE
##       yes: 21576, no: 18622, unk: 990, NA: 0   FALSE
##  adm: 10422, blu: 9254, tec: 6743, ser: 3969   FALSE
##        no: 33950, yes: 6248, unk: 990, NA: 0   FALSE
##   mar: 24928, sin: 11568, div: 4612, unk: 80   FALSE
##  may: 13769, jul: 7174, aug: 6178, jun: 5318   FALSE
##      non: 35563, fai: 4252, suc: 1373, NA: 0   FALSE
##                  no: 36548, yes: 4640, NA: 0   FALSE
## 
## ── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete     n mean   sd p0 p25 p50 p75 p100     hist
##  campaign       0    41188 41188 2.57 2.77  1   1   2   3   56 ▇▁▁▁▁▁▁▁
##  previous       0    41188 41188 0.17 0.49  0   0   0   0    7 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ────────────────────────────────────────────────────────────────────────────────────────────────
##        variable missing complete     n   mean   sd    p0    p25    p50
##   cons.conf.idx       0    41188 41188 -40.5  4.63 -50.8 -42.7  -41.8 
##  cons.price.idx       0    41188 41188  93.58 0.58  92.2  93.08  93.75
##     p75   p100     hist
##  -36.4  -26.9  ▁▅▆▃▇▁▁▁
##   93.99  94.77 ▁▁▅▅▁▇▁▂
# Alternative reduced data set that keeps "euribor3m" and drops only "pdays",
# "nr.employed" and "emp.var.rate".
EUbank3 <- bankraw2 %>% select(-pdays, -nr.employed, -emp.var.rate)
  • After reviewing the correlation matrix, the need to remove the variables below is clear: because they are correlated with each other, they could weaken our model
    • “euribor3m”, “nr.employed”, “emp.var.rate”
  • Additionally, since “pdays” and “previous” are highly correlated, we choose to remove “pdays”, because we think it will be more useful as a categorical variable in Objective 2
  • See correlation matrix below after correlated continuous variables have been removed
# Re-plot the correlations of the numeric columns left in bank3 to double
# check that the correlated variables have been removed.
bank3 %>%
  keep(is.numeric) %>%
  cor() %>%
  corrplot(
    type = "upper", method = "square", order = "hclust",
    addCoef.col = "white", number.digits = 2, number.cex = 0.5,
    tl.srt = 45, tl.cex = 0.8
  )

# Same correlation plot for EUbank3, the variant that still contains
# "euribor3m".
EUbank3 %>%
  keep(is.numeric) %>%
  cor() %>%
  corrplot(
    type = "upper", method = "square", order = "hclust",
    addCoef.col = "white", number.digits = 2, number.cex = 0.5,
    tl.srt = 45, tl.cex = 0.8
  )

Categorical Variable Review

# 1. Name target variable
#targetCatCat <- "Subscription"

# 2. Name explanatory variable
#explanatory <- bank3 %>% keep(is.factor) %>% colnames

# 3. Create function
# Horizontal 100%-stacked bar chart of one categorical column against the
# response: each bar is one level of `explanatory`, filled by the share of each
# `response` level.
#   df          : data frame containing both columns
#   explanatory : name (string) of the categorical column shown as bars
#   response    : name (string) of the column mapped to the fill colour
# Returns the ggplot object.
# aes_string() is deprecated, so the columns are looked up through the .data
# pronoun instead. labs() pins the axis and legend titles to the plain column
# names so the charts read the same as before.
numCatCat <- function(df, explanatory, response) {
  ggplot(data = df) +
    geom_bar(
      aes(x = .data[[explanatory]], fill = .data[[response]]),
      position = "fill",
      alpha = 0.9
    ) +
    coord_flip() +
    labs(x = explanatory, fill = response)
}

#  # 3a. Example of working function above
#  # numCatCat(bank3, explanatory = "education", response = "Subscription")


# 4. Create plot list for plot_grid function to reference
#plotlistCatCat <- lapply(explanatory, function(x) numCatCat(bank3, x, targetCatCat))

# 5. Grid of all categorical variables plotted against y = Subscription
#plot_grid(plotlist = plotlistCatCat)
  • Individual breakdowns using the above function
# Preview the first six rows of the reduced data set.
head(bank3, n = 6L)
##         job marital   education default housing loan   contact month
## 1 housemaid married    basic.4y      no      no   no telephone   may
## 2  services married high.school unknown      no   no telephone   may
## 3  services married high.school      no     yes   no telephone   may
## 4    admin. married    basic.6y      no      no   no telephone   may
## 5  services married high.school      no      no  yes telephone   may
## 6  services married    basic.9y unknown      no   no telephone   may
##   day_of_week campaign previous    poutcome cons.price.idx cons.conf.idx
## 1         mon        1        0 nonexistent         93.994         -36.4
## 2         mon        1        0 nonexistent         93.994         -36.4
## 3         mon        1        0 nonexistent         93.994         -36.4
## 4         mon        1        0 nonexistent         93.994         -36.4
## 5         mon        1        0 nonexistent         93.994         -36.4
## 6         mon        1        0 nonexistent         93.994         -36.4
##   Subscription
## 1           no
## 2           no
## 3           no
## 4           no
## 5           no
## 6           no
# Plot every categorical predictor in bank3 against Subscription, one chart
# per variable. "contact" is included: it is a factor column that stays in the
# exported data but previously had no chart in this review.
# print() is explicit so the charts are also drawn when the file is source()d,
# where top-level values are not auto-printed.
categoricalVars <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "month", "day_of_week", "poutcome"
)
for (categoricalVar in categoricalVars) {
  print(numCatCat(bank3, explanatory = categoricalVar, response = "Subscription"))
}

  • Upon reviewing all of the Categorical variables we can clearly remove the below variables
    • marital, housing, loan, day_of_week
  • While the below variables seem to show strong correlation with the response variable
    • job, education, default, month, poutcome
# Drop the categorical variables judged uninformative from both reduced data
# sets (the one without and the one with "euribor3m").
bank4 <- bank3 %>% select(-marital, -housing, -loan, -day_of_week)
EUbank4 <- EUbank3 %>% select(-marital, -housing, -loan, -day_of_week)

Summary Check on Variables

# Final sanity check of the variables kept for modelling.
summary(object = bank4)
##           job                      education        default     
##  admin.     :10422   university.degree  :12168   no     :32588  
##  blue-collar: 9254   high.school        : 9515   unknown: 8597  
##  technician : 6743   basic.9y           : 6045   yes    :    3  
##  services   : 3969   professional.course: 5243                  
##  management : 2924   basic.4y           : 4176                  
##  retired    : 1720   basic.6y           : 2292                  
##  (Other)    : 6156   (Other)            : 1749                  
##       contact          month          campaign         previous    
##  cellular :26144   may    :13769   Min.   : 1.000   Min.   :0.000  
##  telephone:15044   jul    : 7174   1st Qu.: 1.000   1st Qu.:0.000  
##                    aug    : 6178   Median : 2.000   Median :0.000  
##                    jun    : 5318   Mean   : 2.568   Mean   :0.173  
##                    nov    : 4101   3rd Qu.: 3.000   3rd Qu.:0.000  
##                    apr    : 2632   Max.   :56.000   Max.   :7.000  
##                    (Other): 2016                                   
##         poutcome     cons.price.idx  cons.conf.idx   Subscription
##  failure    : 4252   Min.   :92.20   Min.   :-50.8   no :36548   
##  nonexistent:35563   1st Qu.:93.08   1st Qu.:-42.7   yes: 4640   
##  success    : 1373   Median :93.75   Median :-41.8               
##                      Mean   :93.58   Mean   :-40.5               
##                      3rd Qu.:93.99   3rd Qu.:-36.4               
##                      Max.   :94.77   Max.   :-26.9               
## 

Export data set for simple logistic analysis

# Export both reduced data sets for the simple logistic analysis.
# The output directory defaults to the original location but can be overridden
# with the BANK_DATA_DIR environment variable, so the script is not tied to
# one machine's folder layout.
# NOTE(review): write.csv() also writes the row names as an unnamed first
# column; left unchanged because downstream readers may expect that layout.
outputDir <- Sys.getenv(
  "BANK_DATA_DIR",
  unset = "/Users/Jaco/Desktop/SMU/Spring2020/DS_6372_Applied_Statistics/Project.2/Data"
)
write.csv(bank4, file = file.path(outputDir, "simplelogic.csv"))
write.csv(EUbank4, file = file.path(outputDir, "simplelogicEU3.csv"))